#Introduction This document is an analysis of the classification and origin of 2410 brands of beer produced by 558 breweries across the continential United States.

Of particular interest was the high concentration of unique beer brand production in breweries on the west coast, the significantly above average number of IPA style beers, and majority of beers that cluster around low alcohol content and bitterness.

Colorado has 39 breweries reported in the data, followed by California with 37. Additional information on the destinations of beer produced in these states and further research could determine if this indicates a reigonally well established demand for local beer variety, corporate interests in the area, or an oversaturated production environment.

Of the 2410 brands represented in the data, 17.6% of them are IPA style. The next 10% are APA, and 5.5% American Amber/Pale Ale. The top 3 most populous styles of beer make up more than 1/3 of all the reported beer brands. Further research could determine if this stylistic majority represents a marketplace glut or genuine demand for beers in these styles.

(percentage from graph) of the beer brands in the data have an alcohol content of 5% or less, with a bitterness rating of 25, with only (percentage) of brands exceeding 10% ABV and 100 IBU. This indicates that most breweries believe that beers around this level of bitterness and alcohol have the highest rate of consumption. Further research could determine if the alcohol content and bitterness level of a beer is a factor in popularity, or if production cost or legal stipulations make production more attractive to breweries.

#'x' is the column of a data.frame that holds 2 digit state codes
stateFromLower <-function(x) {
  #read 52 state codes into local variable [includes DC (Washington D.C. and PR (Puerto Rico)]
  st.codes<-data.frame(
    state=as.factor(c("AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
                      "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME",
                      "MI", "MN", "MO", "MS",  "MT", "NC", "ND", "NE", "NH", "NJ", "NM",
                      "NV", "NY", "OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN",
                      "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY")),
    full=as.factor(c("alaska","alabama","arkansas","arizona","california","colorado",
                     "connecticut","district of columbia","delaware","florida","georgia",
                     "hawaii","iowa","idaho","illinois","indiana","kansas","kentucky",
                     "louisiana","massachusetts","maryland","maine","michigan","minnesota",
                     "missouri","mississippi","montana","north carolina","north dakota",
                     "nebraska","new hampshire","new jersey","new mexico","nevada",
                     "new york","ohio","oklahoma","oregon","pennsylvania","puerto rico",
                     "rhode island","south carolina","south dakota","tennessee","texas",
                     "utah","virginia","vermont","washington","wisconsin",
                     "west virginia","wyoming"))
  )
  #create an nx1 data.frame of state codes from source column
  st.x<-data.frame(state=x)
  #match source codes with codes from 'st.codes' local variable and use to return the full state name
  refac.x<-st.codes$full[match(st.x$state,st.codes$state)]
  #return the full state names in the same order in which they appeared in the original source
  return(refac.x)
  
}

#1. How many breweries are present in each state?

#beer_path="/Users/wailunchung/Documents/GitHub/MSDS-6306-Doing-Data-Science/Unit 7/Beers.csv"
#brewery_path="/Users/wailunchung/Documents/GitHub/MSDS-6306-Doing-Data-Science/Unit 7/Breweries.csv"
#added some redirects for my machine, switch comments back when you run it on yours jtan
#beer_path="D:/SMU/DS 6306 Doing Data Science/DS_6306_case_study_01/DS_6306_case_study_01/Raw Data/Beers.csv"
#brewery_path="D:/SMU/DS 6306 Doing Data Science/DS_6306_case_study_01/DS_6306_case_study_01/Raw Data/Breweries.csv"
beer_path<-file.path(getwd(), "Raw Data", "Beers.csv")
brewery_path<-file.path(getwd(), "Raw Data", "Breweries.csv")

beer_df=read.csv(beer_path)
brewery_df=read.csv(brewery_path)

# returns string w/o leading whitespace
trim <- function (x)  sub("^\\s+", "", x)
brewery_df$State <- trim(brewery_df$State) 

# covert state to full state name
brewery_df$State_full<-stateFromLower(brewery_df$State)
count_by_state <- data.frame(table(brewery_df$State_full))

#ggplot(count_by_state, aes(x=reorder(Var1,-Freq),y=Freq,fill=Var1)) + 
#  geom_bar(stat="identity") + 
#  labs(title="Brewery Count by State",subtitle="",x="State",y="freq",caption="") +
#  theme(plot.title=element_text(hjust=0.5),axis.text.x = element_text(angle = 90, hjust = #1),legend.position="none")

#install.packages("maps")
# using maps to get coordinate and use ggplot to plot maps
#install.packages('ggthemes')
library(maps)
library(ggplot2)
library(ggthemes)
us_states <- map_data("state")
head(us_states,10)
##         long      lat group order  region subregion
## 1  -87.46201 30.38968     1     1 alabama      <NA>
## 2  -87.48493 30.37249     1     2 alabama      <NA>
## 3  -87.52503 30.37249     1     3 alabama      <NA>
## 4  -87.53076 30.33239     1     4 alabama      <NA>
## 5  -87.57087 30.32665     1     5 alabama      <NA>
## 6  -87.58806 30.32665     1     6 alabama      <NA>
## 7  -87.59379 30.30947     1     7 alabama      <NA>
## 8  -87.59379 30.28655     1     8 alabama      <NA>
## 9  -87.67400 30.27509     1     9 alabama      <NA>
## 10 -87.81152 30.25790     1    10 alabama      <NA>
beer_brew_df<-merge(x=beer_df,y=brewery_df,by.x="Brewery_id",by.y="Brew_ID",all.x = TRUE)
#beer_brew_df_map<-merge(x=brewery_df,y=us_states,by.x="State_full",by.y="region",all.x = TRUE)
beer_brew_df_map<-merge(x=count_by_state,y=us_states,by.x="Var1",by.y="region",all.x = TRUE)
#count_by_state
#names(beer_brew_df_map)

#beer_brew_df_map

p <- ggplot(data = beer_brew_df_map,
            aes(x = long, y = lat,
                group = group, fill = Freq))
#p + geom_polygon(color = "gray90", size = 0.1) + guides(fill = FALSE)
p + geom_polygon(color = "gray90", size = 0.1) +
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) + 
  labs(title = "Brewery Distribution") +
  theme_map()

#2. Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.

beer_brew_df<-merge(x=beer_df,y=brewery_df,by.x="Brewery_id",by.y="Brew_ID",all.x = TRUE)
head(beer_brew_df,6)
##   Brewery_id        Name.x Beer_ID   ABV IBU
## 1          1  Get Together    2692 0.045  50
## 2          1 Maggie's Leap    2691 0.049  26
## 3          1    Wall's End    2690 0.048  19
## 4          1       Pumpion    2689 0.060  38
## 5          1    Stronghold    2688 0.060  25
## 6          1   Parapet ESB    2687 0.056  47
##                                 Style Ounces             Name.y
## 1                        American IPA     16 NorthGate Brewing 
## 2                  Milk / Sweet Stout     16 NorthGate Brewing 
## 3                   English Brown Ale     16 NorthGate Brewing 
## 4                         Pumpkin Ale     16 NorthGate Brewing 
## 5                     American Porter     16 NorthGate Brewing 
## 6 Extra Special / Strong Bitter (ESB)     16 NorthGate Brewing 
##          City State State_full
## 1 Minneapolis    MN  minnesota
## 2 Minneapolis    MN  minnesota
## 3 Minneapolis    MN  minnesota
## 4 Minneapolis    MN  minnesota
## 5 Minneapolis    MN  minnesota
## 6 Minneapolis    MN  minnesota
tail(beer_brew_df,6)
##      Brewery_id                    Name.x Beer_ID   ABV IBU
## 2405        556             Pilsner Ukiah      98 0.055  NA
## 2406        557  Heinnieweisse Weissebier      52 0.049  NA
## 2407        557           Snapperhead IPA      51 0.068  NA
## 2408        557         Moo Thunder Stout      50 0.049  NA
## 2409        557         Porkslap Pale Ale      49 0.043  NA
## 2410        558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                        Name.y
## 2405         German Pilsener     12         Ukiah Brewing Company
## 2406              Hefeweizen     12       Butternuts Beer and Ale
## 2407            American IPA     12       Butternuts Beer and Ale
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company
##               City State State_full
## 2405         Ukiah    CA california
## 2406 Garrattsville    NY   new york
## 2407 Garrattsville    NY   new york
## 2408 Garrattsville    NY   new york
## 2409 Garrattsville    NY   new york
## 2410     Anchorage    AK     alaska

#3. Report the number of NA’s in each column.

summary(beer_df$ABV) #62 na
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00100 0.05000 0.05600 0.05977 0.06700 0.12800      62
summary(beer_df$IBU) #1005 na
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    4.00   21.00   35.00   42.71   64.00  138.00    1005
sapply(beer_brew_df, function(x) sum(is.na(x))) #another check
## Brewery_id     Name.x    Beer_ID        ABV        IBU      Style 
##          0          0          0         62       1005          0 
##     Ounces     Name.y       City      State State_full 
##          0          0          0          0          0
#optional remove NA's
cmerge <- beer_brew_df[complete.cases(beer_brew_df),]
#sapply(cmerge, function(x) sum(is.na(x))) #to check cmerge for NA's

#filter by abv/ibu to get boxes for slide 8
#less than 100 IBU, 10% ABV
bbox<- subset(cmerge, cmerge$ABV <0.1 & cmerge$IBU < 100)
#1353/1405, 96.2%
#less than 50 IBU, 7.5% ABV
rbox <- subset(cmerge, cmerge$ABV < 0.06 &cmerge$ABV > 0.03 & cmerge$IBU <50)
#740/1405, 52.6%

There are 62 beers with no reported ABV, and 1005 beers with no reported IBU

#4. Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.

#library(dplyr)
#beer_brew_df %>%
#  group_by(State)%>% 
#  summarise(Mean=mean(ABV,na.rm=TRUE), Max=max(ABV,na.rm=TRUE), Min=min(ABV,na.rm=TRUE), Median=median(ABV,na.rm=TRUE), Std=sd(ABV,na.rm=TRUE))

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
median_IBU_by_state <- beer_brew_df %>%
  group_by(State_full)%>% 
  summarise(Median=median(IBU,na.rm=TRUE))

median_IBU_by_state[is.na(median_IBU_by_state$Median),]
## # A tibble: 1 x 2
##   State_full   Median
##   <fct>         <dbl>
## 1 south dakota     NA
# *** state "SD" no Median, All IBU are NA in SD
distinct(beer_brew_df,State)
##    State
## 1     MN
## 2     KY
## 3     MA
## 4     CA
## 5     SC
## 6     CO
## 7     MI
## 8     IN
## 9     TX
## 10    AZ
## 11    MO
## 12    WI
## 13    IL
## 14    VT
## 15    ME
## 16    PA
## 17    KS
## 18    NY
## 19    NH
## 20    GA
## 21    VA
## 22    FL
## 23    MD
## 24    NC
## 25    WY
## 26    OR
## 27    RI
## 28    CT
## 29    OH
## 30    AK
## 31    MS
## 32    AR
## 33    LA
## 34    WV
## 35    UT
## 36    ID
## 37    WA
## 38    OK
## 39    NE
## 40    HI
## 41    IA
## 42    SD
## 43    NJ
## 44    MT
## 45    DC
## 46    NV
## 47    TN
## 48    NM
## 49    AL
## 50    DE
## 51    ND
beer_brew_df[beer_brew_df$State=='SD',]
##      Brewery_id                    Name.x Beer_ID   ABV IBU
## 1237        213 Red Water Irish Style Red    2145 0.065  NA
## 1238        213                  Mjöllnir    1804 0.066  NA
## 1239        213  Bear Butte Nut Brown Ale    1602 0.055  NA
## 1240        213    Easy Livin' Summer Ale    1301 0.045  NA
## 1241        213          Canyon Cream Ale     542 0.055  NA
## 1242        213        Pile O'Dirt Porter     272 0.069  NA
## 1243        213             11th Hour IPA     271 0.060  NA
##                         Style Ounces                    Name.y      City
## 1237 American Amber / Red Ale     12 Crow Peak Brewing Company Spearfish
## 1238     Herbed / Spiced Beer     12 Crow Peak Brewing Company Spearfish
## 1239       American Brown Ale     12 Crow Peak Brewing Company Spearfish
## 1240      American Blonde Ale     12 Crow Peak Brewing Company Spearfish
## 1241                Cream Ale     12 Crow Peak Brewing Company Spearfish
## 1242          American Porter     12 Crow Peak Brewing Company Spearfish
## 1243             American IPA     12 Crow Peak Brewing Company Spearfish
##      State   State_full
## 1237    SD south dakota
## 1238    SD south dakota
## 1239    SD south dakota
## 1240    SD south dakota
## 1241    SD south dakota
## 1242    SD south dakota
## 1243    SD south dakota
ggplot(median_IBU_by_state, aes(x=reorder(State_full,-Median),y=Median,fill=State_full)) + 
  geom_bar(stat="identity") + 
  labs(title="IBU Median by State",subtitle="",x="State",y="Median",caption="") +
  theme(plot.title=element_text(hjust=0.5),axis.text.x = element_text(angle = 90, hjust = 1),legend.position="none")
## Warning: Removed 1 rows containing missing values (position_stack).

National Average ABV is 5.9% National Average IBU is 42.7

#5. Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?

ABV_by_state <- beer_brew_df %>%
  group_by(State_full)%>% 
  summarise(Mean=mean(ABV,na.rm=TRUE),Median=median(ABV,na.rm=TRUE))

ABV_by_state
## # A tibble: 51 x 3
##    State_full             Mean Median
##    <fct>                 <dbl>  <dbl>
##  1 alabama              0.062  0.06  
##  2 alaska               0.0556 0.056 
##  3 arizona              0.0602 0.055 
##  4 arkansas             0.052  0.052 
##  5 california           0.0611 0.058 
##  6 colorado             0.0634 0.0605
##  7 connecticut          0.0611 0.06  
##  8 delaware             0.055  0.055 
##  9 district of columbia 0.0656 0.0625
## 10 florida              0.0599 0.057 
## # … with 41 more rows
IBU_by_state <- beer_brew_df %>%
  group_by(State_full)%>% 
  summarise(Mean=mean(IBU,na.rm=TRUE),Median=median(IBU,na.rm=TRUE))

IBU_by_state
## # A tibble: 51 x 3
##    State_full            Mean Median
##    <fct>                <dbl>  <dbl>
##  1 alabama               51.2   43  
##  2 alaska                40.9   46  
##  3 arizona               35.2   20.5
##  4 arkansas              39     39  
##  5 california            46.3   42  
##  6 colorado              47.4   40  
##  7 connecticut           40.8   29  
##  8 delaware              52     52  
##  9 district of columbia  55.2   47.5
## 10 florida               46.8   55  
## # … with 41 more rows
#Top 10 ABV state (Mean)
top_n(ABV_by_state, n=10, Mean) %>%
  ggplot(., aes(x=reorder(State_full,-Mean),y=Mean,fill=State_full)) + 
  geom_bar(stat="identity") + 
  labs(title="Top 10 ABV Average by State",subtitle="",x="State",y="Average",caption="") +
  theme(plot.title=element_text(hjust=0.5),axis.text.x = element_text(angle = 90, hjust = 1),legend.position="none")

#Top 10 IBU state (Mean)
top_n(IBU_by_state, n=10, Mean) %>%
  ggplot(., aes(x=reorder(State_full,-Mean),y=Mean,fill=State_full)) + 
  geom_bar(stat="identity") + 
  labs(title="Top 10 IBU Average by State",subtitle="",x="State",y="Average",caption="") +
  theme(plot.title=element_text(hjust=0.5),axis.text.x = element_text(angle = 90, hjust = 1),legend.position="none")

#--------

summary(beer_brew_df)
##    Brewery_id                       Name.x        Beer_ID      
##  Min.   :  1.0   Nonstop Hef Hop       :  12   Min.   :   1.0  
##  1st Qu.: 94.0   Dale's Pale Ale       :   6   1st Qu.: 808.2  
##  Median :206.0   Oktoberfest           :   6   Median :1453.5  
##  Mean   :232.7   Longboard Island Lager:   4   Mean   :1431.1  
##  3rd Qu.:367.0   1327 Pod's ESB        :   3   3rd Qu.:2075.8  
##  Max.   :558.0   Boston Lager          :   3   Max.   :2692.0  
##                  (Other)               :2376                   
##       ABV               IBU                                    Style     
##  Min.   :0.00100   Min.   :  4.00   American IPA                  : 424  
##  1st Qu.:0.05000   1st Qu.: 21.00   American Pale Ale (APA)       : 245  
##  Median :0.05600   Median : 35.00   American Amber / Red Ale      : 133  
##  Mean   :0.05977   Mean   : 42.71   American Blonde Ale           : 108  
##  3rd Qu.:0.06700   3rd Qu.: 64.00   American Double / Imperial IPA: 105  
##  Max.   :0.12800   Max.   :138.00   American Pale Wheat Ale       :  97  
##  NA's   :62        NA's   :1005     (Other)                       :1298  
##      Ounces                             Name.y               City     
##  Min.   : 8.40   Brewery Vivant            :  62   Grand Rapids:  66  
##  1st Qu.:12.00   Oskar Blues Brewery       :  46   Portland    :  64  
##  Median :12.00   Sun King Brewing Company  :  38   Chicago     :  55  
##  Mean   :13.59   Cigar City Brewing Company:  25   Indianapolis:  43  
##  3rd Qu.:16.00   Sixpoint Craft Ales       :  24   San Diego   :  42  
##  Max.   :32.00   Hopworks Urban Brewery    :  23   Boulder     :  41  
##                  (Other)                   :2192   (Other)     :2099  
##     State                State_full  
##  Length:2410        colorado  : 265  
##  Class :character   california: 183  
##  Mode  :character   michigan  : 162  
##                     indiana   : 139  
##                     texas     : 130  
##                     oregon    : 125  
##                     (Other)   :1406
median_ABV_by_state <- beer_brew_df %>%
  group_by(State_full)%>% 
  summarise(Median=median(ABV,na.rm=TRUE))

#Top 10 ABV state
top_n(median_ABV_by_state, n=10, Median) %>%
ggplot(., aes(x=reorder(State_full,-Median),y=Median,fill=State_full)) + 
  geom_bar(stat="identity") + 
  labs(title="Top 10 ABV Median by State",subtitle="",x="State",y="Median",caption="") +
  theme(plot.title=element_text(hjust=0.5),axis.text.x = element_text(angle = 90, hjust = 1),legend.position="none") #+ geom_smooth(method = "lm")

#Top 10 IBU state
top_n(median_IBU_by_state, n=10, Median) %>%
  ggplot(., aes(x=reorder(State_full,-Median),y=Median,fill=State_full)) + 
  geom_bar(stat="identity") + 
  labs(title="Top 10 IBU Median by State",subtitle="",x="State",y="Median",caption="") +
  theme(plot.title=element_text(hjust=0.5),axis.text.x = element_text(angle = 90, hjust = 1),legend.position="none")

The highest national ABV is (name of beer),(some percent%) alcohol by volume, produced in (state) The highest national IBU is (name of beer),(some rating) bitterness rating, produced in (state)

#6. Summary statistics for the ABV variable.

summary(beer_brew_df$ABV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00100 0.05000 0.05600 0.05977 0.06700 0.12800      62

#7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot.

ggplot(beer_brew_df, aes(x=ABV, y=IBU)) + 
  geom_point(shape=18, color="blue")+
  geom_smooth(method=lm,  linetype="dashed",
              color="darkred", fill="blue")
## Warning: Removed 1005 rows containing non-finite values (stat_smooth).
## Warning: Removed 1005 rows containing missing values (geom_point).

#Extra: ABV, IBU by style

beer_df$Style_cat='Unknown'

#clean special characters 
beer_df$Style <- gsub('Kölsch', 'Kolsch', beer_df$Style)
beer_df$Style <- gsub('Märzen', 'Marzen', beer_df$Style)
beer_df$Style <- gsub('Bière', 'Biere', beer_df$Style)

#IPA
beer_df$Style_cat <- ifelse(grepl("India Pale Ale", ignore.case=TRUE, beer_df$Style), "IPA", beer_df$Style_cat)
beer_df$Style_cat <- ifelse(grepl("IPA", ignore.case=TRUE, beer_df$Style), "IPA", beer_df$Style_cat)
#Porters
beer_df$Style_cat <- ifelse(grepl("porter", ignore.case=TRUE, beer_df$Style), "Porters", beer_df$Style_cat)
#Stouts
beer_df$Style_cat <- ifelse(grepl("stout", ignore.case=TRUE, beer_df$Style), "Stouts", beer_df$Style_cat)

#Strong Ales
strong_ales_key <-c('red ale','wine ale', 'old ale', 'wine', 'strong ale','quadrupel','tripel','Scotch Ale','Wee Heavy')
beer_df$Style_cat <- ifelse(grepl(paste(strong_ales_key,collapse="|"), ignore.case=TRUE, beer_df$Style), "Strong Ales", beer_df$Style_cat)

#Wheat Beers
wheat_beer_key <- c('wheat', 'Witbier', 'Weissbier','Dunkelweizen','Hefeweizen')
beer_df$Style_cat <- ifelse(grepl(paste(wheat_beer_key,collapse="|"), ignore.case=TRUE, beer_df$Style), "Wheat Beers", beer_df$Style_cat)

#Wild or Sour Beers
wild_beer_key <-c('Gose', 'wild ale','oud bruin','Flanders red ale')
beer_df$Style_cat <- ifelse(grepl(paste(wild_beer_key,collapse="|"), ignore.case=TRUE, beer_df$Style), "Wild or Sour Beers", beer_df$Style_cat)

#Hybrid Beers
hybrid_beer_key <- c('california common', 'Garde', 'cream ale','kolsch','Altbier','braggot')
beer_df$Style_cat <- ifelse(grepl(paste(hybrid_beer_key,collapse="|"), ignore.case=TRUE, beer_df$Style), "Hybrid Beers", beer_df$Style_cat)

#Dark Lagers
dark_key <-c('schwarzbier','Marzen','Oktoberfest')
beer_df$Style_cat <- ifelse(grepl(paste(dark_key,collapse="|"), ignore.case=TRUE, beer_df$Style), "Dark Lagers", beer_df$Style_cat)

#Bocks
bock_key<-c('Weizen','bock')
beer_df$Style_cat <- ifelse(grepl(paste(bock_key,collapse="|"), ignore.case=TRUE, beer_df$Style), "Bocks", beer_df$Style_cat)

#Dark Ales
dark_ale_key<-c('black ale', 'dark ale', 'dubbel', 'roggenbier', 'scottish ale', 'winter warmer')
beer_df$Style_cat <- ifelse(grepl(paste(dark_ale_key,collapse="|"), ignore.case=TRUE, beer_df$Style), "Dark Ales", beer_df$Style_cat)

#Brown Ales
brown_key<-c('Brown Ale','Oud Bruin','Mild')
beer_df$Style_cat <- ifelse(grepl(paste(brown_key,collapse="|"), ignore.case=TRUE, beer_df$Style), "Brown Ales", beer_df$Style_cat)

#Specialty Beers
spec_key<-c('Shandy','cider','mead','radler','chile beer','low alcohol beer','black ale','Pumpkin Ale', 'Rye Beer', 'Smoked Beer', 'rauchbier', 'herbed','spiced','Fruit','Vegetable')
beer_df$Style_cat <- ifelse(grepl(paste(spec_key,collapse="|"), ignore.case=TRUE, beer_df$Style), "Specialty Beers", beer_df$Style_cat)

#Pale Ales
pal_key<-c('blonde ale', 'bitter', 'ESB', 'pale ale', 'Abbey Single Ale','saison','grisette')
beer_df$Style_cat <- ifelse(grepl(paste(pal_key,collapse="|"), ignore.case=TRUE, beer_df$Style), "Pale Ales", beer_df$Style_cat)

#Pilsener and Pale Lager
pil_key<-c('pilsener', 'pilsner','Helles','Keller Bier','Zwickel Bier','Malt Liquor','american lager')
beer_df$Style_cat <- ifelse(grepl(paste(pil_key,collapse="|"), ignore.case=TRUE, beer_df$Style), 'Pilsener and Pale Lager', beer_df$Style_cat)

#Lager
lager_key<-c(' lager')
beer_df$Style_cat <- ifelse(grepl(paste(lager_key,collapse="|"), ignore.case=TRUE, beer_df$Style), "Lager", beer_df$Style_cat)

beer_brew_df<-merge(x=beer_df,y=brewery_df,by.x="Brewery_id",by.y="Brew_ID",all.x = TRUE)
ABV_by_style <- beer_brew_df %>%
  group_by(Style_cat)%>% 
  summarise(Mean=mean(ABV,na.rm=TRUE),Median=median(ABV,na.rm=TRUE))

ABV_by_style
## # A tibble: 16 x 3
##    Style_cat                 Mean Median
##    <chr>                    <dbl>  <dbl>
##  1 Bocks                   0.0558 0.052 
##  2 Brown Ales              0.0562 0.055 
##  3 Dark Ales               0.0709 0.0695
##  4 Dark Lagers             0.0568 0.055 
##  5 Hybrid Beers            0.0547 0.054 
##  6 IPA                     0.0689 0.068 
##  7 Lager                   0.0501 0.05  
##  8 Pale Ales               0.0551 0.054 
##  9 Pilsener and Pale Lager 0.0518 0.05  
## 10 Porters                 0.0618 0.06  
## 11 Specialty Beers         0.0597 0.059 
## 12 Stouts                  0.0687 0.0635
## 13 Strong Ales             0.0647 0.058 
## 14 Unknown                 0.0493 0.05  
## 15 Wheat Beers             0.0499 0.05  
## 16 Wild or Sour Beers      0.0563 0.05
IBU_by_style <- beer_brew_df %>%
  group_by(Style_cat)%>% 
  summarise(Mean=mean(IBU,na.rm=TRUE),Median=median(IBU,na.rm=TRUE))

IBU_by_style
## # A tibble: 16 x 3
##    Style_cat                Mean Median
##    <chr>                   <dbl>  <dbl>
##  1 Bocks                    18.6   16  
##  2 Brown Ales               27.9   25  
##  3 Dark Ales                27.5   22.5
##  4 Dark Lagers              24.2   24  
##  5 Hybrid Beers             25.6   23  
##  6 IPA                      72.3   70  
##  7 Lager                    22.9   19  
##  8 Pale Ales                37.2   36  
##  9 Pilsener and Pale Lager  31.3   30  
## 10 Porters                  33.5   32  
## 11 Specialty Beers          34.5   20  
## 12 Stouts                   45.0   38  
## 13 Strong Ales              39.0   30.5
## 14 Unknown                  21.7   22  
## 15 Wheat Beers              19.3   18  
## 16 Wild or Sour Beers       13.3   10
#Top 10 ABV Style (Mean)
top_n(ABV_by_style, n=99, Mean) %>%
  ggplot(., aes(x=reorder(Style_cat,-Mean),y=Mean,fill=Style_cat)) + 
  geom_bar(stat="identity") + 
  labs(title="Top 10 ABV Average by Style",subtitle="",x="Style",y="Average",caption="") +
  theme(plot.title=element_text(hjust=0.5),axis.text.x = element_text(angle = 90, hjust = 1),legend.position="none")

#Top 10 IBU Style (Mean)
top_n(IBU_by_style, n=99, Mean) %>%
  ggplot(., aes(x=reorder(Style_cat,-Mean),y=Mean,fill=Style_cat)) + 
  geom_bar(stat="identity") + 
  labs(title="Top 10 IBU Average by Style",subtitle="",x="Style",y="Average",caption="") +
  theme(plot.title=element_text(hjust=0.5),axis.text.x = element_text(angle = 90, hjust = 1),legend.position="none")

##extra2 style pie chart

#install.packages("ggforce")

library(dplyr)
library(ggforce)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# cleaner code by Joe
count_by_style_cat <- data.frame(table(beer_brew_df$Style_cat))
names(count_by_style_cat)<-c("Style_cat","Freq")
percent_by_style_cat <- data.frame(count_by_style_cat$Style_cat, count_by_style_cat$Freq/sum(count_by_style_cat$Freq)*100)
names(percent_by_style_cat) <- c("Style", "percent")
percent_by_style_cat
##                      Style    percent
## 1                    Bocks  2.4481328
## 2               Brown Ales  4.0663900
## 3                Dark Ales  2.4066390
## 4              Dark Lagers  1.6182573
## 5             Hybrid Beers  2.3236515
## 6                      IPA 23.1535270
## 7                    Lager  6.5560166
## 8                Pale Ales 20.2074689
## 9  Pilsener and Pale Lager  4.1493776
## 10                 Porters  3.0705394
## 11         Specialty Beers  7.8838174
## 12                  Stouts  4.1493776
## 13             Strong Ales  8.3402490
## 14                 Unknown  1.9917012
## 15             Wheat Beers  6.9294606
## 16      Wild or Sour Beers  0.7053942
p <- plot_ly(percent_by_style_cat, labels = ~Style, values = ~percent, type = 'pie',textposition = 'inside',textinfo = 'label+percent') %>%
  layout(title = 'Style Distribution',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
####################
# 
style1 <- beer_df$style

popular.styles <- table(unlist(beer_df$Style))
#want to get this in order, descending
#maybe dataframe can sort and keep label
pop.df <- as.data.frame(popular.styles)
names(pop.df) <- c("Style", "Count")
popular.styles <- pop.df[order(pop.df$Count, decreasing = TRUE),]
#ok got it
pstyles <- popular.styles

#calculate percentage of reported data per style
plabel <- data.frame(pstyles$Style, pstyles$Count/sum(pstyles$Count)*100)
names(plabel) <- c("Style", "percent")

p <- plot_ly(pstyles, labels = pstyles$Style, values = plabel$percent, type = 'pie',textposition = 'outside',textinfo = 'label+percent') %>%
  layout(title = 'Style Distribution',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
#too many one instance styles, trim?

pstyles.trim <- pstyles[1:30,]
plabel.trim <- plabel[1:30, ]
p2 <- plot_ly(pstyles.trim, labels = pstyles.trim$Style, values = plabel.trim$percent, type = 'pie',textposition = 'outside',textinfo = 'label+percent') %>%
  layout(title = 'Style Distribution',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
count_by_style_cat <- data.frame(table(beer_brew_df$Style_cat))
names(count_by_style_cat)<-c("Style_cat","Freq")
percent_by_style_cat <- data.frame(count_by_style_cat$Style_cat, count_by_style_cat$Freq/sum(count_by_style_cat$Freq)*100)
names(percent_by_style_cat) <- c("Style", "percent")
percent_by_style_cat
##                      Style    percent
## 1                    Bocks  2.4481328
## 2               Brown Ales  4.0663900
## 3                Dark Ales  2.4066390
## 4              Dark Lagers  1.6182573
## 5             Hybrid Beers  2.3236515
## 6                      IPA 23.1535270
## 7                    Lager  6.5560166
## 8                Pale Ales 20.2074689
## 9  Pilsener and Pale Lager  4.1493776
## 10                 Porters  3.0705394
## 11         Specialty Beers  7.8838174
## 12                  Stouts  4.1493776
## 13             Strong Ales  8.3402490
## 14                 Unknown  1.9917012
## 15             Wheat Beers  6.9294606
## 16      Wild or Sour Beers  0.7053942
p <- plot_ly(percent_by_style_cat, labels = ~Style, values = ~percent, type = 'pie',textposition = 'inside',textinfo = 'label+percent') %>%
  layout(title = 'Style Distribution',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

big high level conclusion animated slide at the end, conclusion, thank you for the opportunity to present, deeper questions can be reached at email/contact informations. slide design good keep to talking points on the left, let visuals speak for themselves polish and practice, 5 times walkthrough, then video till it has no flaws plot a median line on charts where it is a relevent talking point

analysis factors, add percentage of missing data beers